Cluster analysis

Author

Alexander Lawless

Published

20-02-2025

expand for setup code chunk
knitr::opts_chunk$set(echo = FALSE, warning=FALSE, message=FALSE, fig.width =12, fig.height = 9)

library(tidyverse)
library(GGally)
library(factoextra)
library(plotly)
library(tidyLPA)
library(mclust)


# Load the data and saved objects ----

key_measures_rep_sub_sample <-
  read_csv("key_measures_rep_sub_sample.csv") |>
  select(-person_id) |>
  as.data.frame()

load("cluster_analysis.RData")

load("cluster_analysis_evaluations.RData")

# Functions ----

## kmeans 
wssplot <- function(data, nc = 10, seed = 1234, max_iter = 100) {
  wss <- numeric(nc)
  wss[1] <- (nrow(data) - 1) * sum(apply(data, 2, var))
  set.seed(seed)
  for (i in 2:nc) {
    wss[i] <- sum(kmeans(data, centers = i, nstart = 25, iter.max = max_iter)$withinss)
  }
  plot(1:nc, wss, type = 'b', xlab = 'Number of Clusters', ylab = 'Within Group Sum of Squares',
       main = 'Elbow Method Plot to Find Optimal Number of Clusters', frame.plot = TRUE,
       col = 'blue', lwd = 1.5)
}

link_model_output <- function(model, model_id) {

  model$tot.withinss |>
    tibble() |>
    rename(tot.withinss = 1) |>
    mutate(id = model_id)
}

## LPA

## Evaluation

# Function to subsample data
subsample_data <- function(data, fraction = 0.25) {
  set.seed(123) # For reproducibility
  sample_indices <- sample(1:nrow(data), size = floor(fraction * nrow(data)))
  return(data[sample_indices, ])
}

# Evaluate clustering performance on a subsample
evaluate_clustering_subsample <- function(data, clusters, fraction = 0.25) {
  # Subsample the data
  subsample <- subsample_data(data, fraction)
  subsample_clusters <- clusters[1:nrow(subsample)]

  # Ensure clusters are numeric
  subsample_clusters <- as.numeric(subsample_clusters)

  # Calculate silhouette score
  dist_matrix <- dist(subsample)
  sil <- silhouette(subsample_clusters, dist_matrix)
  silhouette_score <- mean(sil[, 3])

  # Calculate Davies-Bouldin index
  db_index <- index.DB(subsample, subsample_clusters)$DB

  # Calculate within-cluster sum of squares (WCSS)
  wcss <- sum(kmeans(subsample, centers = length(unique(subsample_clusters)))$withinss)

  return(list(silhouette_score = silhouette_score, db_index = db_index, wcss = wcss))
}

# Function to identify the optimum model
select_optimum_model <- function(kmeans_eval, lpa_eval, dbscan_eval) {
  # Combine evaluation metrics into a data frame
  eval_metrics <- data.frame(
    model = c("K-means", "LPA", "DBSCAN"),
    silhouette_score = c(kmeans_eval$silhouette_score, lpa_eval$silhouette_score, dbscan_eval$silhouette_score),
    db_index = c(kmeans_eval$db_index, lpa_eval$db_index, dbscan_eval$db_index),
    wcss = c(kmeans_eval$wcss, lpa_eval$wcss, dbscan_eval$wcss)
  )

  # Normalize the metrics for comparison (higher silhouette score is better, lower DB index and WCSS are better)
  eval_metrics$normalized_silhouette <- eval_metrics$silhouette_score / max(eval_metrics$silhouette_score)
  eval_metrics$normalized_db_index <- min(eval_metrics$db_index) / eval_metrics$db_index
  eval_metrics$normalized_wcss <- min(eval_metrics$wcss) / eval_metrics$wcss

  # Calculate a combined score (you can adjust the weights as needed)
  eval_metrics$combined_score <- eval_metrics$normalized_silhouette + eval_metrics$normalized_db_index + eval_metrics$normalized_wcss

  # Identify the model with the highest combined score
  optimum_model <- eval_metrics[which.max(eval_metrics$combined_score), "model"]

  return(optimum_model)
}


# Look ups ----

key_measure_lookup <-
  tribble(
    ~variable, ~variable_clean,
    "person_id"                 , "0. Person ID",
    "care_contacts"             , "A. Care contacts",
    "referrals"                 , "B. Referrals",
    "length_care"               , "C. Care length",
    "average_daily_contacts"    , "D. Average daily contacts",
    "contact_prop_aft_midpoint" , "E. Contact proportion after midpoint",
    "team_input_count"          , "F. Team input count",
    "intermittent_care_periods" , "G. Intermittent care periods",
    "specialist_contact_prop"   , "H. Specialist care proportion",
    "remote_contact_prop"       , "I. Remote care proportion",
    "avg_contact_duration"      , "J. Average contact duration",
    "acute_admissions_2223"     , "K. Acute admission count"
  )

Cluster analysis approaches

Step 1: Performing K-means Clustering

Create K-means Clustering models:

set.seed(123)  # For reproducibility
kmeans_result_4 <-  kmeans(key_measures_rep_sub_sample, centers = 4, nstart = 100, iter.max = 100) 
kmeans_result_5 <-  kmeans(key_measures_rep_sub_sample, centers = 5, nstart = 100, iter.max = 100) 
kmeans_result_6 <-  kmeans(key_measures_rep_sub_sample, centers = 6, nstart = 100, iter.max = 100) 
kmeans_result_7 <-  kmeans(key_measures_rep_sub_sample, centers = 7, nstart = 100, iter.max = 100) 
kmeans_result_8 <-  kmeans(key_measures_rep_sub_sample, centers = 8, nstart = 100, iter.max = 100) 
kmeans_result_9 <-  kmeans(key_measures_rep_sub_sample, centers = 9, nstart = 100, iter.max = 100) 
kmeans_result_10 <- kmeans(key_measures_rep_sub_sample, centers = 10, nstart = 100, iter.max = 100)
kmeans_result_11 <- kmeans(key_measures_rep_sub_sample, centers = 11, nstart = 100, iter.max = 100)
kmeans_result_12 <- kmeans(key_measures_rep_sub_sample, centers = 12, nstart = 100, iter.max = 100)

Identify the number of clusters to derive

## 1. **Determine the Optimal Number of Clusters**:
# fviz_nbclust(key_measures_scaled_head, kmeans, method = "wss")  # Elbow Method

wssplot(key_measures_rep_sub_sample)

# K-means goodness of fit
## The total within-cluster sum of squares (WSS) measures the compactness of the clusters. Lower values indicate better fit.
## The more clusters derived, the lower the sum of squares
## Trade of between fit and model simplicity?

link_model_output(kmeans_result_4, "clusters_4") |>
  union_all(link_model_output(kmeans_result_5 , "clusters_5")) |>
  union_all(link_model_output(kmeans_result_6 , "clusters_6")) |>
  union_all(link_model_output(kmeans_result_7 , "clusters_7")) |>
  union_all(link_model_output(kmeans_result_8 , "clusters_8")) |>
  union_all(link_model_output(kmeans_result_9 , "clusters_9")) |>
  union_all(link_model_output(kmeans_result_10, "clusters_10")) |>
  union_all(link_model_output(kmeans_result_11, "clusters_11")) |>
  union_all(link_model_output(kmeans_result_12, "clusters_12")) |>
  mutate(rn = row_number()) |>

  ggplot(aes(x = reorder(id, rn), y = tot.withinss)) +
  geom_col(width = 0.01) +
  geom_point(size = 5) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  labs(x = "n_clusters",
       y = "Total within-cluster sum of squares (WSS)",
       title = "Comparing sum of squares Goodness of Fit measure",
       subtitle = "K-means cluster models"
       )

Visualise K-means models:

## 3. **Visualize the Clusters**:
fviz_cluster(kmeans_result_6,
             data = key_measures_rep_sub_sample,
             geom = "point",
             #ellipse.level = 99,
             ellipse.alpha = 0.1,
             pointsize = 0.5
             ) +
  #geom_point(aes(shape = cluster), size = 1, alpha = 0.2) +
  labs(title = "K-means Clustering Results",
     x = "Variable 1",
     y = "Variable 2") +
  theme_minimal()

Pairwise Scatter Plot Matrix

# Perform Principal Component Analysis (PCA)
pca_result <-
  prcomp(key_measures_rep_sub_sample, scale. = TRUE)

# Create a dataframe with the principal components
pca_data <-
  as.data.frame(pca_result$x) |>
  mutate(cluster = as.factor(kmeans_result_6$cluster)) # Add cluster assignments to the dataframe

# Create pairwise scatter plot matrix
ggpairs_plot <- ggpairs(pca_data, aes(color = cluster, alpha = 0.5))
#ggpairs_plot

3D plot of top 3 principle components

# Create 3D plot
plot_ly(pca_data_3d, x = ~PC1, y = ~PC2, z = ~PC3, color = ~cluster
        #colors = c('#1f77b4', '#ff7f0e', '#2ca02c')
        ) %>%
  add_markers(size = 1) %>%
  layout(scene = list(xaxis = list(title = 'PC1'),
                      yaxis = list(title = 'PC2'),
                      zaxis = list(title = 'PC3')),
         title = '3D Plot of Principal Components')

Plot classifications

kmeans_classification_means <-
  bind_cols(key_measures_rep_sub_sample, classification = kmeans_result_6$cluster) |>
  group_by(classification) |>
  summarise_all(funs(mean)) |>
  mutate(classification = paste0("Profile ", 1:6)) |>
  mutate_at(vars(-classification), function(x) round(x, 3)) |>
  rename(profile = classification)

kmeans_classification_means_plot_free <-
  kmeans_classification_means |>
  pivot_longer(cols = -profile) |>
  left_join(key_measure_lookup, by = c("name" = "variable")) |>
  ggplot(aes(x = variable_clean, y = value, fill = variable_clean)) +
  geom_col() +
  facet_wrap(~profile, scale = "free_y") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
        legend.position = "none",
        strip.background = element_rect(fill = NA, colour = "grey")) +
  labs(x = "Variable",
       y = "Mean Zscore",
       title = "Mean Zscore by variable and cluster/profile",
       subtitle = "K-means cluster analysis - 6 clusters")


kmeans_classification_means_plot_fixed <-
  kmeans_classification_means |>
  pivot_longer(cols = -profile) |>
  left_join(key_measure_lookup, by = c("name" = "variable")) |>
  ggplot(aes(x = variable_clean, y = value, fill = variable_clean)) +
  geom_col() +
  facet_wrap(~profile) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
        legend.position = "none",
        strip.background = element_rect(fill = NA, colour = "grey")) +
  labs(x = "Variable",
       y = "Mean Zscore",
       title = "Mean Zscore by variable and cluster/profile",
       subtitle = "K-means cluster analysis - 6 clusters")
kmeans_classification_means_plot_fixed

kmeans_classification_means_plot_free

Step 2: Performing Model-Based Clustering

Test number of clusters in model based approach (LPA)

Function:

explore_model_fit <- function(df,
                              n_profiles_range = 1:12,
                              model_names = c("EII","VII","EEI","VEI","EVI","VVI","EEE",
                                              "VEE","EVE","VVE","EEV","VEV","EVV","VVV"
                                              )) {
  set.seed(123)
  x <- mclustBIC(df, G = n_profiles_range, modelNames = model_names)
  y <- x |>
    as.data.frame.matrix() |>
    rownames_to_column("n_profiles") |>
    rename(
      `EII: spherical, equal volume`                             = EII,
      `VII: spherical, unequal volume`                           = VII,
      `EEI: diagonal, equal volume and shape`                    = EEI,
      `VEI: diagonal, varying volume, equal shape`               = VEI,
      `EVI: diagonal, equal volume, varying shape`               = EVI,
      `VVI: diagonal, varying volume and shape`                  = VVI,
      `EEE: ellipsoidal, equal volume, shape, and orientation`   = EEE,
      `VEE: ellipsoidal, equal shape and orientation`            = VEE,
      `EVE: ellipsoidal, equal volume and orientation`           = EVE,
      `VVE: ellipsoidal, equal orientation`                      = VVE,
      `EEV: ellipsoidal, equal volume and equal shape`           = EEV,
      `VEV: ellipsoidal, equal shape`                            = VEV,
      `EVV: ellipsoidal, equal volume`                           = EVV,
      `VVV: ellipsoidal, varying volume, shape, and orientation` = VVV
      )
  y
}

fit_output <- explore_model_fit(key_measures_rep_sub_sample, n_profiles_range = 1:12)

# Elbow plot
to_plot <-
  fit_output |>
  gather(`Covariance matrix structure`, val, -n_profiles) |>
  mutate(`Covariance matrix structure` = as.factor(`Covariance matrix structure`),
         val = abs(val)) # this is to make the BIC values positive (to align with more common formula / interpretation of BIC)

Elbow plot to test n_clusters

to_plot |>
  tibble() |>
  mutate(n_profiles = as.numeric(n_profiles)) |>

  ggplot(aes(x = n_profiles,
             y = val,
             color = `Covariance matrix structure`,
             group = `Covariance matrix structure`,
             #label = `Covariance matrix structure`
             )) +
  geom_line() +
  geom_point() +
  #geom_label() +
  scale_x_continuous(breaks = scales::pretty_breaks()) +
  theme_minimal() +
  labs(x = "No. clusters",
       y = "BIC (smaller value is better)",
       title = "Comparing LPA model and number of cluster selection",
       subtitle = "Model-based clustering - LPA")

Select model

selected_model_eev_6 <- Mclust(key_measures_scaled_head, G = 6, modelNames = "EEV")
print(summary(selected_model_eev_6))
---------------------------------------------------- 
Gaussian finite mixture model fitted by EM algorithm 
---------------------------------------------------- 

Mclust EEV (ellipsoidal, equal volume and shape) model with 6 components: 

 log-likelihood     n  df      BIC      ICL
      -904691.7 87775 412 -1814073 -1831042

Clustering table:
    1     2     3     4     5     6 
12194  6143 51919  4713  8779  4027 

Plot model classifications

# Plot model classifications
dff <- bind_cols(key_measures_scaled_head, classification = selected_model_eev_6$classification)

proc_df <-
  dff |>
  mutate_at(vars(-classification), scale) |>
  group_by(classification) |>
  summarise_all(funs(mean)) |>
  mutate(classification = paste0("Profile ", 1:6)) |>
  mutate_at(vars(-classification), function(x) round(x, 3)) |>
  rename(profile = classification)

lpa_profiles_mean_fixed <-
  proc_df |>
  pivot_longer(cols = -profile) |>
  left_join(key_measure_lookup, by = c("name" = "variable")) |>
  ggplot(aes(x = variable_clean, y = value, fill = variable_clean)) +
  geom_col() +
  facet_wrap(~profile#, scales = "free_y"
             ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90),
        legend.position = "none",
        strip.background = element_rect(fill = NA, colour = "grey")
        ) +
  labs(x = "Variable",
       y = "Mean Zscore",
       title = "Plotting LPA profiles by variable and average Zscore",
       subtitle = "Model-based clustering - LPA")

lpa_profiles_mean_free <-
  proc_df |>
  pivot_longer(cols = -profile) |>
  left_join(key_measure_lookup, by = c("name" = "variable")) |>
  ggplot(aes(x = variable_clean, y = value, fill = variable_clean)) +
  geom_col() +
  facet_wrap(~profile, scales = "free_y"
             ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90),
        legend.position = "none",
        strip.background = element_rect(fill = NA, colour = "grey")
        ) +
  labs(x = "Variable",
       y = "Mean Zscore",
       title = "Plotting LPA profiles by variable and average Zscore",
       subtitle = "Model-based clustering - LPA")
lpa_profiles_mean_fixed

lpa_profiles_mean_free

Step 3: Performing Density-Based Clustering

Optimal value of “eps” parameter

dbscan::kNNdistplot(key_measures_rep_sub_sample, k =  5)
abline(h = 1.8, lty = 2)

Plot DBSCAN results

# Print DBSCAN
#print(db)

# Plot clusters
dbscan_plot_fviz

Step 4: Evaluate Clustering Models

# Isolate cluster vectors from each model
kmeans_clusters <- kmeans_result_6$cluster
lpa_clusters    <- selected_model_eev_6$classification
dbscan_clusters <- db$cluster

# Function to sub-sample data
subsample_data <- function(data, fraction = 0.25) {
  set.seed(123) # For reproducibility
  sample_indices <- sample(1:nrow(data), size = floor(fraction * nrow(data)))
  return(data[sample_indices, ])
}

# Evaluate clustering performance on a sub-sample
evaluate_clustering_subsample <- function(data, clusters, fraction = 0.25) {
  # Sub-sample the data
  subsample <- subsample_data(data, fraction)
  subsample_clusters <- clusters[1:nrow(subsample)]

  # Ensure clusters are numeric
  subsample_clusters <- as.numeric(subsample_clusters)

  # Calculate silhouette score
  dist_matrix <- dist(subsample)
  sil <- silhouette(subsample_clusters, dist_matrix)
  silhouette_score <- mean(sil[, 3])

  # Calculate Davies-Bouldin index
  db_index <- index.DB(subsample, subsample_clusters)$DB

  # Calculate within-cluster sum of squares (WCSS)
  wcss <- sum(kmeans(subsample, centers = length(unique(subsample_clusters)))$withinss)

  return(list(silhouette_score = silhouette_score, db_index = db_index, wcss = wcss))
}

# Calculate evaluation metrics for each model using a subsample
kmeans_eval <- evaluate_clustering_subsample(key_measures_rep_sub_sample, kmeans_clusters)
lpa_eval    <- evaluate_clustering_subsample(key_measures_rep_sub_sample, lpa_clusters)
dbscan_eval <- evaluate_clustering_subsample(key_measures_rep_sub_sample, dbscan_clusters)

# Function to identify the optimum model
select_optimum_model <- function(kmeans_eval, lpa_eval, dbscan_eval) {
  # Combine evaluation metrics into a data frame
  eval_metrics <- data.frame(
    model = c("K-means", "LPA", "DBSCAN"),
    silhouette_score = c(kmeans_eval$silhouette_score, lpa_eval$silhouette_score, dbscan_eval$silhouette_score),
    db_index = c(kmeans_eval$db_index, lpa_eval$db_index, dbscan_eval$db_index),
    wcss = c(kmeans_eval$wcss, lpa_eval$wcss, dbscan_eval$wcss)
  )

  # Normalize the metrics for comparison (higher silhouette score is better, lower DB index and WCSS are better)
  eval_metrics$normalized_silhouette <- eval_metrics$silhouette_score / max(eval_metrics$silhouette_score)
  eval_metrics$normalized_db_index <- min(eval_metrics$db_index) / eval_metrics$db_index
  eval_metrics$normalized_wcss <- min(eval_metrics$wcss) / eval_metrics$wcss

  # Calculate a combined score (you can adjust the weights as needed)
  eval_metrics$combined_score <- eval_metrics$normalized_silhouette + eval_metrics$normalized_db_index + eval_metrics$normalized_wcss

  # Identify the model with the highest combined score
  optimum_model <- eval_metrics[which.max(eval_metrics$combined_score), "model"]

  return(optimum_model)
}
# Identify the optimum model
optimum_model <- select_optimum_model(kmeans_eval, lpa_eval, dbscan_eval)
print(paste("The optimum model is:", optimum_model))
[1] "The optimum model is: DBSCAN"